**************************************************
* FILTER MASTERFILE TO GEOGRAPHIC BEATS
**************************************************

* Load the master file; the clear option discards any data already in memory,
* so a separate clear command is not needed.
use "masterfile.dta", clear

* 1. Keep only days when officer was present and has an address
*    (assgn_present is compared as a string, add_has_address as a number)
keep if assgn_present == "true" & add_has_address == 1

* 2. Restrict to Police Officer, Sergeant, Lieutenant ranks
*    Ranks are recognised by the first two characters of off_rank.
keep if inlist(substr(off_rank, 1, 2), "PO", "SG", "SE", "LI")

* 3. Restrict to standard shifts (1–4) and geographic districts (1–25)
*    inrange() enforces both ends of each range: unit codes of 0 or below
*    are excluded along with codes above 25, and missing values never pass.
keep if inrange(assgn_shift, 1, 4)
keep if inrange(assgn_unit, 1, 25)

* 4. Normalize beat code: strip leading zeros
*    This must be done row by row. A while loop is not suitable: its
*    condition is a single expression that looks at the first observation
*    only, while a replace inside it would alter every row. regexr() with
*    an anchored pattern removes the full run of leading zeros in each row
*    and leaves rows without a leading zero untouched.
rename assgn_beat beat_raw
replace beat_raw = regexr(beat_raw, "^0+", "")

* 5. Extract numeric part (4- or 3-digit)
*    The digit class is repeated explicitly instead of using {n} repetition
*    counts, which regexm() does not accept in every Stata release; the
*    explicit form matches identically wherever counts are supported.
*    The first run of four digits is preferred; if there is none, the first
*    run of three digits is used. Rows with neither get a missing beat_num.
gen str4 beat4 = regexs(1) if regexm(beat_raw, "([0-9][0-9][0-9][0-9])")
gen str3 beat3 = regexs(1) if regexm(beat_raw, "([0-9][0-9][0-9])")
replace beat4 = beat3 if missing(beat4)
rename beat4 beat_num
destring beat_num, replace

* 6. Flag geographic beats via merge with CPD beat list
*    keep(master match) stops beats that exist only in the lookup file from
*    being appended as rows whose master variables are all missing.
*    Rows with no extracted beat number are never flagged, even if the
*    lookup file happens to contain a missing beat_num.
merge m:1 beat_num using "cpd-beats.dta", keepusing(beat_num) keep(master match)
gen byte assgn_beat_geographic = (_merge == 3) & !missing(beat_num)
drop _merge beat3

* 7. Give the beat code its original variable name back, then remove rows
*    whose code is malformed. A row is malformed when the code is:
*      - empty,
*      - one of the all-zero placeholders,
*      - shorter than 3 or longer than 6 characters, or
*      - not starting with a digit.
rename beat_raw assgn_beat

gen byte bad_beat = 0
replace bad_beat = 1 if missing(assgn_beat)
replace bad_beat = 1 if inlist(assgn_beat, "000", "0000", "00000", "000000")
replace bad_beat = 1 if !inrange(length(assgn_beat), 3, 6)

* Codes that begin with a non-numeric character (or are empty) fail this match
replace bad_beat = 1 if !regexm(assgn_beat, "^[0-9]")

drop if bad_beat
drop bad_beat

* 8. Final subset: discard every row not flagged as a geographic beat
drop if assgn_beat_geographic != 1

* 9. Write the filtered data set, overwriting any existing file of that name
save "masterfile-in-geo-beats.dta", replace
